In [8]:
import numpy as np 
import pandas as pd 

# from subprocess import check_output
# print(check_output(["ls", "../data/"]).decode("utf8"))

In [9]:
train = pd.read_csv('../data/train.csv',
                    dtype={'is_booking':bool,'srch_destination_id':np.int32, 'hotel_cluster':np.int32},
                    usecols=['srch_destination_id','is_booking','hotel_cluster'],
                    chunksize=1000000)

# Setting explicit dtypes (bool, np.int32) and reading in chunks via chunksize makes the data much faster to process!! (see the memory sketch after Out[9] below)


aggs = []
print('-'*38)
for chunk in train:
    agg = chunk.groupby(['srch_destination_id',
                         'hotel_cluster'])['is_booking'].agg(['sum','count'])
    agg.reset_index(inplace=True)
    aggs.append(agg)
    print('.',end='')
print('')
aggs = pd.concat(aggs, axis=0)


--------------------------------------
......................................
Out[9]:
   srch_destination_id  hotel_cluster  sum  count
0                    1             20  0.0      2
1                    1             30  0.0      1
2                    1             60  0.0      2
3                    4             22  1.0      2
4                    4             25  1.0      2
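
The chunked read above relies on the dtype hints to stay light: an int32 column takes half the memory of pandas' default int64, so each million-row chunk is cheaper to load and aggregate. A minimal sketch of that difference on a hypothetical toy frame (not the competition data):

import numpy as np
import pandas as pd

ids = pd.DataFrame({'srch_destination_id': np.arange(1_000_000, dtype=np.int64)})
ids32 = ids.astype({'srch_destination_id': np.int32})

print(ids.memory_usage(deep=True).sum())    # ~8 MB with the default int64
print(ids32.memory_usage(deep=True).sum())  # ~4 MB with np.int32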

In [14]:
CLICK_WEIGHT = 0.05
agg = aggs.groupby(['srch_destination_id','hotel_cluster']).sum().reset_index()
agg.head()


Out[14]:
   srch_destination_id  hotel_cluster  sum  count
0                    0              3  0.0      2
1                    1             20  4.0     26
2                    1             30  2.0     22
3                    1             57  0.0      1
4                    1             60  0.0     17

In [15]:
agg['count'] -= agg['sum']
# 'sum' counts actual bookings (is_booking is boolean), while 'count' counts every row (bookings + clicks), so subtracting leaves clicks only
agg = agg.rename(columns={'sum':'bookings','count':'clicks'})
agg['relevance'] = agg['bookings'] + CLICK_WEIGHT * agg['clicks']
agg.head()


Out[15]:
   srch_destination_id  hotel_cluster  bookings  clicks  relevance
0                    0              3       0.0     2.0       0.10
1                    1             20       4.0    22.0       5.10
2                    1             30       2.0    20.0       3.00
3                    1             57       0.0     1.0       0.05
4                    1             60       0.0    17.0       0.85
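
A quick sanity check of the relevance formula against the second row above (srch_destination_id 1, hotel_cluster 20):

bookings, clicks = 4.0, 22.0
relevance_check = bookings + CLICK_WEIGHT * clicks  # 4.0 + 0.05 * 22.0
print(relevance_check)  # 5.1, matching the relevance column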

In [16]:
def most_popular(group, n_max=5):
    relevance = group['relevance'].values
    hotel_cluster = group['hotel_cluster'].values
    most_popular = hotel_cluster[np.argsort(relevance)[::-1]][:n_max]
    return np.array_str(most_popular)[1:-1] # remove square brackets
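
To see what most_popular returns, here is a small hypothetical group (not taken from the data); the clusters come back as a space-separated string, ordered by descending relevance:

toy = pd.DataFrame({'hotel_cluster': [3, 17, 42],
                    'relevance':     [0.2, 5.0, 1.3]})
print(most_popular(toy))  # '17 42  3' -- cluster 17 has the highest relevance, then 42, then 3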

In [17]:
%%time
most_pop = agg.groupby(['srch_destination_id']).apply(most_popular)
most_pop = pd.DataFrame(most_pop).rename(columns={0:'hotel_cluster'})
most_pop.head()


Wall time: 18.9 s

In [21]:
%%time
test = pd.read_csv('../data/test.csv',
                    dtype={'srch_destination_id':np.int32},
                    usecols=['srch_destination_id'],)


Wall time: 1.88 s

In [22]:
test.head()


Out[22]:
   srch_destination_id
0                12243
1                14474
2                11353
3                 8250
4                11812

In [23]:
test = test.merge(most_pop, how='left',left_on='srch_destination_id',right_index=True)
test.head()


Out[23]:
   srch_destination_id   hotel_cluster
0                12243   5 55 37 11 22
1                14474               5
2                11353   0 31 77 91 96
3                 8250   1 45 79 24 54
4                11812   91 42 2 48 59
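
The left merge keeps every test row and looks up most_pop by its index (srch_destination_id); destinations never seen in train come back as NaN. A minimal sketch of the same lookup pattern, reusing a couple of ids and cluster strings from the outputs above:

lookup = pd.DataFrame({'hotel_cluster': ['5 55 37 11 22', '91 42 2 48 59']},
                      index=pd.Index([12243, 11812], name='srch_destination_id'))
queries = pd.DataFrame({'srch_destination_id': [12243, 65671]})
print(queries.merge(lookup, how='left',
                    left_on='srch_destination_id', right_index=True))
# 12243 gets its cluster string; 65671 is not in the lookup index, so hotel_cluster is NaN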

In [32]:
test[test["hotel_cluster"].isnull()].head()


Out[32]:
     srch_destination_id hotel_cluster
286                65671           NaN
357                13679           NaN
445                44373           NaN
458                65106           NaN
627                51983           NaN

In [24]:
test["hotel_cluster"].isnull().sum()


Out[24]:
14036

In [33]:
# For destinations that never appear in train, just recommend the overall most popular clusters

most_pop_all = agg.groupby('hotel_cluster')['relevance'].sum().nlargest(5).index
# nlargest(5) -> keeps the 5 clusters with the largest summed relevance (sketched after the output below)
most_pop_all = np.array_str(most_pop_all)[1:-1]
most_pop_all


Out[33]:
'91 48 42 59 28'
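
nlargest(5) keeps the five clusters with the highest summed relevance and returns their index labels; a tiny sketch of the same pattern with hypothetical relevance totals:

s = pd.Series({91: 10.0, 48: 8.5, 42: 7.0, 59: 6.5, 28: 6.0, 3: 0.1})
print(s.nlargest(5).index.tolist())  # [91, 48, 42, 59, 28] -- labels of the 5 largest values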

In [34]:
test["hotel_cluster"].fillna(most_pop_all,inplace=True)

In [35]:
test.head()


Out[35]:
   srch_destination_id   hotel_cluster
0                12243   5 55 37 11 22
1                14474               5
2                11353   0 31 77 91 96
3                 8250   1 45 79 24 54
4                11812   91 42 2 48 59

In [37]:
%%time
test["hotel_cluster"].to_csv('predicted_with_pandas.csv',header=True, index_label='id')


Wall time: 6.01 s

Public score : 0.30340


In [ ]: